## [1] "December 22, 2025"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.5.2
library(plotly)
## Warning: package 'plotly' was built under R version 4.5.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
library(shapper)
## Warning: package 'shapper' was built under R version 4.5.2
data <- read.csv("data.csv")
nrow(data)
## [1] 925
summary(data)
## Ref. Limits.of.Potential.Window..V.
## Length:925 Length:925
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## Lower.Limit.of.Potential.Window..V. Upper.Limit.of.Potential.Window..V.
## Min. :-1.1000 Min. :-0.2000
## 1st Qu.:-0.3000 1st Qu.: 0.4000
## Median : 0.0000 Median : 0.6000
## Mean :-0.2343 Mean : 0.6301
## 3rd Qu.: 0.0000 3rd Qu.: 0.8000
## Max. : 0.2000 Max. : 3.5000
## NA's :4 NA's :4
## Potential.Window..V. Current.Density..A.g. Capacitance..F.g.
## Min. :0.4000 Min. : 0.050 Min. : 1.4
## 1st Qu.:0.6000 1st Qu.: 1.000 1st Qu.: 148.6
## Median :0.8250 Median : 2.000 Median : 260.2
## Mean :0.8634 Mean : 5.857 Mean : 415.5
## 3rd Qu.:1.0000 3rd Qu.: 5.000 3rd Qu.: 509.9
## Max. :3.5000 Max. :200.000 Max. :3344.1
## NA's :5 NA's :16 NA's :17
## Specific.Surface.Area..m.2.g. Charge.Transfer.Resistance..Rct...ohm.
## Min. : 8.896 Min. : 0.080
## 1st Qu.: 57.000 1st Qu.: 0.670
## Median : 159.970 Median : 1.540
## Mean : 417.438 Mean : 3.048
## 3rd Qu.: 546.000 3rd Qu.: 3.240
## Max. :2400.000 Max. :24.200
## NA's :572 NA's :786
## Equivalent.Series.Resistance..Rs...ohm. Electrode.Configuration
## Min. : 0.200 Length:925
## 1st Qu.: 0.350 Class :character
## Median : 0.580 Mode :character
## Mean : 1.602
## 3rd Qu.: 2.000
## Max. :17.500
## NA's :772
## Pore.Size..nm. Pore.Volume..cm.3.g. Ratio.of.ID.IG N.at.
## Min. : 0.530 Min. :0.0200 Min. :0.120 Min. : 0.00
## 1st Qu.: 3.045 1st Qu.:0.1680 1st Qu.:0.940 1st Qu.: 0.00
## Median : 4.337 Median :0.2170 Median :1.050 Median : 0.00
## Mean : 8.618 Mean :0.4857 Mean :1.121 Mean : 2.50
## 3rd Qu.:13.625 3rd Qu.:0.5075 3rd Qu.:1.170 3rd Qu.: 3.20
## Max. :44.131 Max. :2.3500 Max. :2.900 Max. :23.82
## NA's :769 NA's :729 NA's :596 NA's :690
## C.at. O.at. Electrolyte.Chemical.Formula
## Min. : 1.40 Min. : 1.900 Length:925
## 1st Qu.:37.32 1st Qu.: 8.883 Class :character
## Median :81.00 Median :13.700 Mode :character
## Mean :66.52 Mean :19.176
## 3rd Qu.:85.58 3rd Qu.:27.098
## Max. :98.10 Max. :54.280
## NA's :699 NA's :703
## Electrolyte.Ionic.Conductivity Electrolyte.Concentration..M.
## Min. :1.000 Min. :0.100
## 1st Qu.:6.000 1st Qu.:1.000
## Median :6.000 Median :1.000
## Mean :5.806 Mean :2.576
## 3rd Qu.:7.000 3rd Qu.:6.000
## Max. :8.000 Max. :6.000
## NA's :99 NA's :62
## Cell.Configuration..three.two.electrode.system.
## Length:925
## Class :character
## Mode :character
##
##
##
##
Zasady czyszczenia danych:
keep_cols <- colSums(is.na(data)) < nrow(data) / 2
keep_cols["Limits.of.Potential.Window..V."] <- FALSE
medians <- sapply(data[, sapply(data, is.numeric)], median, na.rm = TRUE)
median_cols <- intersect(colnames(data)[keep_cols], names(medians))
data <- select(data, colnames(data)[keep_cols])
keep_rows <- rowSums(is.na(data)) < sum(keep_cols) / 2
data <- filter(data, keep_rows)
for (col in median_cols) {
data[is.na(data[, col]), col] <- medians[col]
}
str_strip <- function(x) {
x_sub <- x
while (substring(x_sub, 1, 1) == " ") {
x_sub <- substring(x_sub, 2)
}
while (substring(x_sub, nchar(x_sub)) == " ") {
x_sub <- substring(x_sub, 1, nchar(x_sub) - 1)
}
return(x_sub)
}
string_cols <- colnames(data)[!sapply(data, is.numeric)]
for (col in string_cols) {
data[, col] <- sapply(data[, col], str_strip)
tt <- table(data[, col])
max_val <- names(tt[tt == max(tt)])[1]
data[data[, col] == "", col] <- max_val
}
nrow(data)
## [1] 925
summary(data)
## Ref. Lower.Limit.of.Potential.Window..V.
## Length:925 Min. :-1.1000
## Class :character 1st Qu.:-0.3000
## Mode :character Median : 0.0000
## Mean :-0.2333
## 3rd Qu.: 0.0000
## Max. : 0.2000
## Upper.Limit.of.Potential.Window..V. Potential.Window..V. Current.Density..A.g.
## Min. :-0.2000 Min. :0.4000 Min. : 0.05
## 1st Qu.: 0.4000 1st Qu.:0.6000 1st Qu.: 1.00
## Median : 0.6000 Median :0.8250 Median : 2.00
## Mean : 0.6299 Mean :0.8632 Mean : 5.79
## 3rd Qu.: 0.8000 3rd Qu.:1.0000 3rd Qu.: 5.00
## Max. : 3.5000 Max. :3.5000 Max. :200.00
## Capacitance..F.g. Electrode.Configuration Electrolyte.Chemical.Formula
## Min. : 1.4 Length:925 Length:925
## 1st Qu.: 150.8 Class :character Class :character
## Median : 260.2 Mode :character Mode :character
## Mean : 412.6
## 3rd Qu.: 493.6
## Max. :3344.1
## Electrolyte.Ionic.Conductivity Electrolyte.Concentration..M.
## Min. :1.000 Min. :0.10
## 1st Qu.:6.000 1st Qu.:1.00
## Median :6.000 Median :1.00
## Mean :5.827 Mean :2.47
## 3rd Qu.:7.000 3rd Qu.:6.00
## Max. :8.000 Max. :6.00
## Cell.Configuration..three.two.electrode.system.
## Length:925
## Class :character
## Mode :character
##
##
##
for (col in colnames(data)) {
if (is.numeric(data[, col])) {
hist(data[, col], main = col,
xlab = "Wartość", ylab = "Wystąpienia")
} else if (!(col %in% c("Ref.", "Electrode.Configuration"))) {
tt <- table(data[, col])
val_cnt <- data.frame(vals = names(tt),
counts = as.numeric(tt))
print(ggplot(val_cnt, aes(x = vals, y = counts)) +
geom_bar(stat = "identity") +
labs(x = "Wartość", y = "Wystąpienia", title = col) +
theme(axis.text.x = element_text(angle = 30, vjust = 0.5)))
}
}
elec_conf_vallist <- unique(data$Electrode.Configuration)
elec_chem_vallist <- unique(data$Electrolyte.Chemical.Formula)
cell_conf_vallist <- unique(data$Cell.Configuration..three.two.electrode.system.)
data_w_enums <- data %>%
mutate(Electrode.Configuration = match(Electrode.Configuration, elec_conf_vallist)) %>%
mutate(Electrolyte.Chemical.Formula = match(Electrolyte.Chemical.Formula, elec_chem_vallist)) %>%
mutate(Cell.Configuration..three.two.electrode.system. = match(Cell.Configuration..three.two.electrode.system., cell_conf_vallist))
ggcorrplot(cor(select(data_w_enums, where(is.numeric))))
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
## Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p <- ggplot(data) +
geom_linerange(aes(x = Capacitance..F.g.,
ymin = Lower.Limit.of.Potential.Window..V.,
ymax = Upper.Limit.of.Potential.Window..V.,
color = Cell.Configuration..three.two.electrode.system.),
alpha = 0.15) +
labs(y = "Potential.Window..V.", color = "Cell Configuration",
title = "Porównanie konfiguracji elektrod") +
coord_flip()
ggplotly(p)
n_list <- names(head(sort(table(data$Electrolyte.Chemical.Formula), decreasing = TRUE), 10))
data_filtered <- data %>%
filter(Electrolyte.Chemical.Formula %in% n_list)
p <- ggplot(data_filtered) +
geom_linerange(aes(x = Capacitance..F.g.,
ymin = Lower.Limit.of.Potential.Window..V.,
ymax = Upper.Limit.of.Potential.Window..V.,
color = Electrolyte.Chemical.Formula),
alpha = 0.2) +
labs(y = "Potential.Window..V.", color = "Chemical Formula",
title = "Porównanie najczęstszych składów chemicznych elektrolitu") +
coord_flip()
ggplotly(p)
# zmiana kolumn tekstowych na enumy
elec_conf_vallist <- unique(data$Electrode.Configuration)
elec_chem_vallist <- unique(data$Electrolyte.Chemical.Formula)
cell_conf_vallist <- unique(data$Cell.Configuration..three.two.electrode.system.)
ml_data <- data %>%
mutate(Electrode.Configuration = match(Electrode.Configuration, elec_conf_vallist)) %>%
mutate(Electrolyte.Chemical.Formula = match(Electrolyte.Chemical.Formula, elec_chem_vallist)) %>%
mutate(Cell.Configuration..three.two.electrode.system. = match(Cell.Configuration..three.two.electrode.system., cell_conf_vallist)) %>%
select(-Ref.)
# podział zbioru danych
set.seed(9001)
is_training <- createDataPartition(
y = ml_data$Capacitance..F.g.,
p = .75,
list = FALSE
)
training_data <- ml_data[is_training, ]
testing_data <- ml_data[-is_training, ]
# uczenie
ctrl <- trainControl(
method = "cv",
# number = 2,
# repeats = 10
)
set.seed(1337)
fit <- train(
Capacitance..F.g. ~ .,
data = training_data,
method = "lm",
trControl = ctrl
)
# predykcja
predictions <- predict(fit, newdata = select(testing_data, -Capacitance..F.g.))
# shap
shapped <- shap(fit, data = testing_data, new_observation = testing_data[1, ])
shapped <- shapped %>%
rename(attribution = "_attribution_", vname = "_vname_", sign = "_sign_") %>%
filter(vname != "Capacitance..F.g.")
# plot(shapped)
ggplot(shapped, aes(x = vname, y = attribution, fill = sign)) +
geom_bar(stat = "identity") + coord_flip() +
geom_text(aes(label = round(attribution, 2), hjust = "inward")) +
guides(fill = FALSE)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.